{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-10-05 14:30:22.496628: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA\n",
      "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
      "2023-10-05 14:30:22.570469: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2023-10-05 14:30:22.960050: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory\n",
      "2023-10-05 14:30:22.960097: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory\n",
      "2023-10-05 14:30:22.960099: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "from transformers import GPT2Tokenizer\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    T5ForConditionalGeneration,\n",
    "    AutoConfig,\n",
    ")\n",
    "from tokenizers import AddedToken\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained('mesolitica/jawi-nanot5-tiny-malaysian-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('./out-super-tiny-1.1/tokenizer_config.json',\n",
       " './out-super-tiny-1.1/special_tokens_map.json',\n",
       " './out-super-tiny-1.1/vocab.json',\n",
       " './out-super-tiny-1.1/merges.txt',\n",
       " './out-super-tiny-1.1/added_tokens.json',\n",
       " './out-super-tiny-1.1/tokenizer.json')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_pretrained('./out-super-tiny-1.1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b1650de131cc4dbcb31fe802fb0584c4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading (…)lve/main/config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "config = AutoConfig.from_pretrained(\n",
    "    'google/t5-v1_1-base'\n",
    ")\n",
    "config.dropout_rate = 0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "config.d_ff = 1024\n",
    "config.num_heads = 6\n",
    "config.num_layers = 2\n",
    "config.d_model = 256\n",
    "config.num_decoder_layers = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = T5ForConditionalGeneration(\n",
    "    config,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained('./out-super-tiny-1.1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 86M\r\n",
      "-rw-r--r-- 1 husein husein 2.6K Okt   5 14:30 added_tokens.json\r\n",
      "-rw-r--r-- 1 husein husein  794 Okt   5 14:31 config.json\r\n",
      "-rw-r--r-- 1 husein husein  147 Okt   5 14:31 generation_config.json\r\n",
      "-rw-r--r-- 1 husein husein 297K Okt   5 14:30 merges.txt\r\n",
      "-rw-r--r-- 1 husein husein  84M Okt   5 14:31 pytorch_model.bin\r\n",
      "-rw-r--r-- 1 husein husein 2.2K Okt   5 14:30 special_tokens_map.json\r\n",
      "-rw-r--r-- 1 husein husein  21K Okt   5 14:30 tokenizer_config.json\r\n",
      "-rw-r--r-- 1 husein husein 1.4M Okt   5 14:30 tokenizer.json\r\n",
      "-rw-r--r-- 1 husein husein 506K Okt   5 14:30 vocab.json\r\n"
     ]
    }
   ],
   "source": [
    "!ls -lh out-super-tiny-1.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
